import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import math
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut

from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import cross_validate

import warnings


#define the functions for the skill scores
def HK_skill_score(A, B, C, D):
    """Return ``(A*D - C*B) / ((A + B) * (C + D))`` for four counts.

    Algebraically this is ``A / (A + B) - C / (C + D)``.
    NOTE(review): presumably the Hanssen-Kuipers skill score with A = hits,
    B = misses, C = false alarms and D = correct negatives -- confirm
    against the callers.

    With plain Python numbers a ``ZeroDivisionError`` is raised when
    ``A + B`` or ``C + D`` is zero.
    """
    numerator = A * D - C * B
    denominator = (A + B) * (C + D)
    return numerator / denominator


def Accuracy(A, B, C, D):
    """Return ``(A + D) / (A + B + C + D)``.

    A and D are the counts that are added up in the numerator; B and C only
    contribute to the total. With plain Python numbers a
    ``ZeroDivisionError`` is raised when all four counts are zero.
    """
    total = A + B + C + D
    return (A + D) / total


def Balanced_Accuracy(A, B, C, D):
    """Return the mean of ``A / (A + B)`` and ``D / (C + D)``.

    NOTE(review): presumably A/(A+B) is the hit rate on the positive class
    and D/(C+D) the hit rate on the negative class -- confirm against the
    callers.

    With plain Python numbers a ``ZeroDivisionError`` is raised when
    ``A + B`` or ``C + D`` is zero.
    """
    first_rate = A / (A + B)
    second_rate = D / (C + D)
    return (first_rate + second_rate) / 2

# Select the data set to analyse. Only the selected year is read from disk.
# When switching the year, also switch the file the dates are taken from
# further down in the model section.
DATA_YEAR = 2020  # 2020 or 2022

if DATA_YEAR == 2022:
    # 2022 data: every column of time_to_rain.csv except 'datetime' is kept.
    indicator_data = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only.csv')
    df_timetorain = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/time_to_rain.csv')
    del indicator_data['Date']
    del df_timetorain['datetime']

    # optional to delete (see thesis)
    del indicator_data['ants_carry_food_to_hole']
elif DATA_YEAR == 2020:
    # 2020 data only: just the 'sameday' column is kept (as a Series).
    indicator_data = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only_2020.csv')
    df_timetorain = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/time_to_rain_2020.csv')
    del df_timetorain['datetime']
    df_timetorain = df_timetorain['sameday']
    del indicator_data['Date']

    # optional to delete (see thesis)
    del indicator_data['Duck']
    del indicator_data['mosquito']
    del indicator_data['Other']

    del indicator_data['ants_carry_food_to_hole']
else:
    raise ValueError('DATA_YEAR must be 2020 or 2022, got ' + repr(DATA_YEAR))

# Load and pre-process the individual forecasts before use.
# Load the Meteoblue forecast.
MB_prob_prec = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/MB_prob_prec_total.csv')
MB_prob_prec['datetime'] = pd.to_datetime(MB_prob_prec['datetime'])
MB_prob_prec = MB_prob_prec.rename(columns={'value_x': 'MB_rain_amount', 'value_y': 'MB_rain_prob'})
del MB_prob_prec['cat_x']
del MB_prob_prec['cat_y']

# Filter the forecast rain amount with the forecast rain probability:
#   * probability below 60           -> amount is set to 0
#   * probability of 60 or higher    -> amount is kept
#   * probability missing            -> amount is kept unless it is below 1
# A probability of exactly 60 counts as reaching the threshold; the strict
# "< 60" / "> 60" pair would otherwise leave such rows without a value.
_mb_prob = MB_prob_prec['MB_rain_prob']
_mb_amount = MB_prob_prec['MB_rain_amount']
# Comparisons with NaN are False, so "_mb_prob < 60" never fires for a
# missing probability and "_mb_amount < 1" never fires for a missing amount.
_mb_zero_out = (_mb_prob.isna() & (_mb_amount < 1)) | (_mb_prob < 60)
MB_prob_prec['MB_rain_amount_filt'] = _mb_amount.mask(_mb_zero_out, 0)

del MB_prob_prec['MB_rain_prob']
del MB_prob_prec['MB_rain_amount']

# Farmer rainfall observations.
farmer_observations = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/Farmer_observations.csv')
farmer_observations = farmer_observations.rename(columns={'value': 'Farmer_obs'})
farmer_observations['datetime'] = pd.to_datetime(farmer_observations['datetime'])

# Make the observations binary: 1 where the observed value is above zero,
# 0 everywhere else (this includes missing values, because NaN > 0 is False).
# The column keeps its original dtype.
farmer_observations['Farmer_obs'] = (
    farmer_observations['Farmer_obs'] > 0
).astype(farmer_observations['Farmer_obs'].dtype)

# Farmer predictions (only good farmers from Nakpanzoo, Nabogu or Yapalsi).
farmer_forecast = pd.read_csv('C:/Users/joepb/PycharmProjects/data_storage/farmer_forecast_combined.csv')
# The unnamed first column holds the timestamps; move it to a parsed
# 'datetime' column at the end of the frame.
farmer_forecast['datetime'] = pd.to_datetime(farmer_forecast.pop('Unnamed: 0'))

# Debug output: print a running index for every item obtained by iterating
# df_timetorain (column labels if it is a DataFrame, values if it is a Series).
for day,dayname in enumerate(df_timetorain):
    print(day)

# Here, the model starts

# Random states. Every value is used once for the indicator models (r) and
# once for the integration models (p), so four values give 16 results per
# model combination.
list_random = [2, 3, 6, 42]
# list_random = [2]

# File the dates of the indicator observations are taken from.
# Switch these if 2020 or 2022 is used.
# INDICATOR_DATES_FILE = 'C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only.csv'
INDICATOR_DATES_FILE = 'C:/Users/joepb/PycharmProjects/data_storage/indicator_data_only_2020.csv'

# At most this many target days are passed on as features to the second stage.
MAX_FEATURE_DAYS = 3


def _build_indicator_models(seed):
    """Return the (name, estimator) pairs of the first (indicator) stage.

    Comment out the models you want to leave out.
    """
    # Members of the Voting Classifier (VC)
    voting_members = [
        ('RF', RandomForestClassifier(random_state=seed, n_estimators=100)),
        ('BNB', BernoulliNB()),
        ('SVC', SVC(probability=True, random_state=seed)),
    ]
    return [
        ('BNB', BernoulliNB()),
        ('RF', RandomForestClassifier(max_features=1, random_state=seed, n_estimators=100)),
        ('SVM', SVC(gamma='auto', probability=True, random_state=seed)),
        ('VC', VotingClassifier(estimators=voting_members, voting='soft', weights=[2, 1, 1])),
        ('NN', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=seed,
                             max_iter=5000)),
    ]


def _build_integration_models(seed):
    """Return the (name, estimator) pairs that integrate all forecasts (second stage)."""
    voting_members = [
        ('RF', RandomForestClassifier(random_state=seed, n_estimators=100)),
        ('BNB', BernoulliNB()),
        ('SVC', SVC(probability=True, random_state=seed)),
    ]
    return [
        # ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
        # ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('RF', RandomForestClassifier(max_features=1, random_state=seed, n_estimators=100)),
        ('SVM', SVC(gamma='auto', random_state=seed)),
        ('VC', VotingClassifier(estimators=voting_members, voting='soft', weights=[2, 1, 1])),
        ('NN', MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=seed,
                             max_iter=50000000)),
    ]


def _contingency_counts(observed, correct):
    """Count the four cells that are passed to HK_skill_score.

    ``observed`` holds the observed class (1 or 0) per sample and ``correct``
    the per-sample leave-one-out accuracy (1 = predicted correctly, 0 = not).
    Returns ``(A, B, C, D)`` as plain ints:
    A: observed 1 and correct, B: observed 1 and wrong,
    C: observed 0 and wrong,   D: observed 0 and correct.
    """
    observed = np.asarray(observed).ravel()
    correct = np.asarray(correct).ravel()
    is_one = observed == 1
    is_zero = observed == 0
    is_right = correct == 1
    is_wrong = correct == 0
    return (int(np.sum(is_one & is_right)), int(np.sum(is_one & is_wrong)),
            int(np.sum(is_zero & is_wrong)), int(np.sum(is_zero & is_right)))


warnings.filterwarnings("ignore", category=UserWarning, message="y_pred contains classes not in y_true")

# The dates are read once and separately, so that indicator_data (including
# the columns that were deliberately removed from it) is left untouched and
# every model is trained on the same set of features.
indicator_dates = pd.to_datetime(pd.read_csv(INDICATOR_DATES_FILE, usecols=['Date'])['Date'])

# One target column per day. A single target (Series) is treated as day 0.
# Using several days was tested but did not improve performance.
if isinstance(df_timetorain, pd.Series):
    rain_targets = df_timetorain.to_frame()
else:
    rain_targets = df_timetorain
n_feature_days = min(rain_targets.shape[1], MAX_FEATURE_DAYS)

# Use either LOOCV or Kfold splitting
# kfold = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
loocv = LeaveOneOut()

# Scores of all (indicator model, integration model, r, p) combinations.
# They are collected over ALL random states, because the plots further down
# compare the results of the different random states.
integration_scores = {}

for r in list_random:
    # ---- First part: indicator models. This part only depends on r. ----
    ML_results_ind_kfold_probability = pd.DataFrame(index=np.arange(len(indicator_data)))
    ML_results_ind_kfold = pd.DataFrame(index=['Accuracy', 'HK_score'])
    models = _build_indicator_models(r)

    X = indicator_data
    for name, model in models:
        for day in range(rain_targets.shape[1]):
            Y = rain_targets.iloc[:, day]
            # With leave-one-out, every score is the accuracy on a single
            # sample: 1 if that sample was predicted correctly, else 0.
            cv_score = cross_val_score(model, X, Y.values.ravel(), cv=loocv, scoring='accuracy')
            cv_predict = cross_val_predict(model, X, Y, cv=loocv, method='predict_proba')

            # HK score / accuracy of the indicator model
            A, B, C, D = _contingency_counts(Y, cv_score)
            ML_results_ind_kfold.loc['Accuracy', name] = np.mean(cv_score)
            ML_results_ind_kfold.loc['HK_score', name] = HK_skill_score(A, B, C, D)

            # Here the probabilities of the ML model are saved, which is the most important part
            ML_results_ind_kfold_probability['probability_rain_' + name + str(day)] = cv_predict[:, 1]
            ML_results_ind_kfold_probability['probability_no_rain_' + name + str(day)] = cv_predict[:, 0]

    ML_results_ind_kfold_probability['datetime'] = indicator_dates

    # ---- Second part: integrate the indicator probabilities with the other forecasts. ----
    for p in list_random:
        models1 = _build_integration_models(p)

        for name, model in models:
            # If only one target day is available, only day 0 is used.
            best_ind_results = pd.DataFrame()
            for day in range(n_feature_days):
                best_ind_results['probability_no_rain_day' + str(day)] = \
                    ML_results_ind_kfold_probability['probability_no_rain_' + name + str(day)]
                best_ind_results['probability_rain_day' + str(day)] = \
                    ML_results_ind_kfold_probability['probability_rain_' + name + str(day)]
            best_ind_results['datetime'] = ML_results_ind_kfold_probability['datetime']

            # Cut out all data for which we have no farmer observations, by
            # merging all datasets into one dataset.
            all_predictions = (
                MB_prob_prec
                .merge(best_ind_results, on='datetime', how='outer')
                .merge(farmer_forecast, on='datetime', how='outer')
            )
            predictions_with_farmer_obs = all_predictions.merge(farmer_observations, on='datetime', how='inner')
            predictions_with_farmer_obs = predictions_with_farmer_obs.dropna(axis=0, how='any')

            X1 = predictions_with_farmer_obs.drop(columns=['Farmer_obs', 'datetime']).reset_index(drop=True)
            Y1 = predictions_with_farmer_obs['Farmer_obs'].reset_index(drop=True)

            # This is the same structure as before
            for name1, model1 in models1:
                cv_score_2_acc = cross_val_score(model1, X1, Y1.values.ravel(), cv=loocv, scoring='accuracy')
                A, B, C, D = _contingency_counts(Y1, cv_score_2_acc)
                integration_scores[name + name1 + str(r) + str(p)] = {
                    'Accuracy': np.mean(cv_score_2_acc),
                    'HK_score': HK_skill_score(A, B, C, D),
                }

# Rows 'Accuracy' and 'HK_score', one column per combination and random-state pair.
ML_results_kfold = pd.DataFrame(integration_scores, index=['Accuracy', 'HK_score'])

# First copy the results into new variables to safely store them.
# NOTE(review): the original code read an undefined name `ML_results`; the
# result table of this script, ML_results_kfold, is assumed to be meant.
ML_results = ML_results_kfold.copy()
ML_results_with_acc = ML_results.copy()
ML_results = ML_results.drop('Accuracy')

ML_results_with_bal_acc = ML_results.copy()
# This script does not create a 'Balanced_accuracy' row, so a missing row is
# not treated as an error.
ML_results = ML_results.drop('Balanced_accuracy', errors='ignore')

# Plot the results

# Standard deviation of the HK score over the columns (random states) of
# every model combination.
std_dev_rndm_test = pd.DataFrame(index = np.arange(1))
ML_results_kfold_VCRF = ML_results_kfold.filter(regex='VCRF', axis=1)

model_name_list = []
for name,model in models:
    for name1,model1 in models1:
        std_dev_rndm_test[name+name1] = np.std((ML_results_kfold.filter(regex=name+name1, axis=1).loc['HK_score']))

# Arrow style for the labels of points that coincide with an earlier point.
_ARROW_STYLE = dict(arrowstyle="->", color="0.5",
                    shrinkA=5, shrinkB=5,
                    patchA=None, patchB=None,
                    connectionstyle="arc3,rad=0.")

# One figure per model combination: accuracy against HK skill score, one
# labelled marker per column of that combination.
for name,model in models:
    for name1,model1 in models1:
        combo_results = ML_results_kfold.filter(regex=name+name1, axis=1)
        X = combo_results.loc['Accuracy']
        Y = combo_results.loc['HK_score']

        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(X, Y, 'bx')  # Plotting data

        # The labels of coinciding points are moved to different offsets.
        # Points are treated as coinciding when accuracy + HK score is equal.
        storage = []
        for j in combo_results.columns:
            v = combo_results.loc['Accuracy', j]
            i = combo_results.loc['HK_score', j]
            storage.append(i + v)
            occurrences = storage.count(i + v)
            if occurrences > 2:
                ax.annotate(j, xy=(v, i), xytext=(20, -20), textcoords='offset pixels',
                            arrowprops=dict(_ARROW_STYLE))
            elif occurrences > 1:
                ax.annotate(j, xy=(v, i), xytext=(-20, -40), textcoords='offset pixels',
                            arrowprops=dict(_ARROW_STYLE))
            else:
                ax.annotate(j, xy=(v, i), xytext=(-20, 4), textcoords='offset pixels')

        ax.set_xlabel('Accuracy')
        ax.set_ylabel("HK skill score")
        fig.savefig(
            'C:/Users/joepb/OneDrive/Documenten/Wageningen - Uni/Master Thesis/Draft thesis figures and docs/vergelijking_methods/'+(name+name1) +'.png',
            bbox_inches='tight')
        # plt.show()
        # Close the figure once it is saved, otherwise all figures stay in memory.
        plt.close(fig)